kreuzberg 3.7.0__py3-none-any.whl → 3.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. kreuzberg/_entity_extraction.py +1 -2
  2. kreuzberg/_extractors/_base.py +39 -1
  3. kreuzberg/_extractors/_email.py +149 -0
  4. kreuzberg/_extractors/_html.py +15 -3
  5. kreuzberg/_extractors/_image.py +21 -36
  6. kreuzberg/_extractors/_pandoc.py +3 -14
  7. kreuzberg/_extractors/_pdf.py +81 -48
  8. kreuzberg/_extractors/_presentation.py +62 -10
  9. kreuzberg/_extractors/_spread_sheet.py +179 -4
  10. kreuzberg/_extractors/_structured.py +148 -0
  11. kreuzberg/_gmft.py +314 -7
  12. kreuzberg/_mime_types.py +27 -1
  13. kreuzberg/_ocr/__init__.py +10 -1
  14. kreuzberg/_ocr/_base.py +59 -0
  15. kreuzberg/_ocr/_easyocr.py +91 -0
  16. kreuzberg/_ocr/_paddleocr.py +89 -0
  17. kreuzberg/_ocr/_tesseract.py +564 -4
  18. kreuzberg/_registry.py +4 -0
  19. kreuzberg/_types.py +131 -0
  20. kreuzberg/_utils/_cache.py +52 -4
  21. kreuzberg/_utils/_errors.py +3 -7
  22. kreuzberg/_utils/_process_pool.py +180 -7
  23. kreuzberg/_utils/_quality.py +237 -0
  24. kreuzberg/_utils/_serialization.py +4 -2
  25. kreuzberg/_utils/_string.py +153 -10
  26. kreuzberg/_utils/_sync.py +5 -2
  27. kreuzberg/_utils/_table.py +261 -0
  28. kreuzberg/cli.py +1 -2
  29. kreuzberg/extraction.py +4 -22
  30. {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/METADATA +58 -54
  31. kreuzberg-3.8.1.dist-info/RECORD +53 -0
  32. kreuzberg/_multiprocessing/__init__.py +0 -6
  33. kreuzberg/_multiprocessing/gmft_isolated.py +0 -330
  34. kreuzberg/_multiprocessing/process_manager.py +0 -189
  35. kreuzberg/_multiprocessing/sync_easyocr.py +0 -235
  36. kreuzberg/_multiprocessing/sync_paddleocr.py +0 -199
  37. kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
  38. kreuzberg/_multiprocessing/tesseract_pool.py +0 -359
  39. kreuzberg-3.7.0.dist-info/RECORD +0 -56
  40. {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/WHEEL +0 -0
  41. {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/entry_points.txt +0 -0
  42. {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import os
3
4
  import re
4
5
  from dataclasses import dataclass
5
6
  from functools import lru_cache
@@ -181,8 +182,6 @@ def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig
181
182
  import spacy
182
183
 
183
184
  if spacy_config.model_cache_dir:
184
- import os
185
-
186
185
  os.environ["SPACY_DATA"] = str(spacy_config.model_cache_dir)
187
186
 
188
187
  nlp = spacy.load(model_name)
@@ -3,10 +3,12 @@ from __future__ import annotations
3
3
  from abc import ABC, abstractmethod
4
4
  from typing import TYPE_CHECKING, ClassVar
5
5
 
6
+ from kreuzberg._types import ExtractionResult, normalize_metadata
7
+ from kreuzberg._utils._quality import calculate_quality_score, clean_extracted_text
8
+
6
9
  if TYPE_CHECKING:
7
10
  from pathlib import Path
8
11
 
9
- from kreuzberg import ExtractionResult
10
12
  from kreuzberg._types import ExtractionConfig
11
13
 
12
14
 
@@ -90,3 +92,39 @@ class Extractor(ABC):
90
92
  return mime_type in cls.SUPPORTED_MIME_TYPES or any(
91
93
  mime_type.startswith(supported_type) for supported_type in cls.SUPPORTED_MIME_TYPES
92
94
  )
95
+
96
+ def _apply_quality_processing(self, result: ExtractionResult) -> ExtractionResult:
97
+ """Apply quality post-processing to extraction result if enabled.
98
+
99
+ Args:
100
+ result: The raw extraction result
101
+
102
+ Returns:
103
+ Enhanced extraction result with quality improvements (if enabled)
104
+ """
105
+ # Only apply quality processing if enabled in config
106
+ if not self.config.enable_quality_processing:
107
+ return result
108
+
109
+ if not result.content:
110
+ return result
111
+
112
+ # Clean the content
113
+ cleaned_content = clean_extracted_text(result.content)
114
+
115
+ # Calculate quality score
116
+ quality_score = calculate_quality_score(cleaned_content, dict(result.metadata) if result.metadata else None)
117
+
118
+ # Add quality metadata
119
+ enhanced_metadata = dict(result.metadata) if result.metadata else {}
120
+ enhanced_metadata["quality_score"] = quality_score
121
+
122
+ # Return enhanced result
123
+ return ExtractionResult(
124
+ content=cleaned_content,
125
+ mime_type=result.mime_type,
126
+ metadata=normalize_metadata(enhanced_metadata),
127
+ chunks=result.chunks,
128
+ detected_languages=result.detected_languages,
129
+ tables=result.tables,
130
+ )
@@ -0,0 +1,149 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from html import unescape
5
+ from typing import TYPE_CHECKING, Any, ClassVar
6
+
7
+ from anyio import Path as AsyncPath
8
+
9
+ from kreuzberg._extractors._base import Extractor
10
+ from kreuzberg._mime_types import EML_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
11
+ from kreuzberg._types import ExtractionResult, normalize_metadata
12
+ from kreuzberg._utils._string import normalize_spaces
13
+ from kreuzberg._utils._sync import run_sync
14
+ from kreuzberg.exceptions import MissingDependencyError
15
+
16
+ if TYPE_CHECKING:
17
+ from pathlib import Path
18
+
19
+ # Import optional dependencies at module level with proper error handling
20
+ try:
21
+ import mailparse
22
+ except ImportError:
23
+ mailparse = None
24
+
25
+ try:
26
+ import html2text # type: ignore[import-not-found]
27
+ except ImportError:
28
+ html2text = None
29
+
30
+ # Compile regex pattern once at module level
31
+ _HTML_TAG_PATTERN = re.compile(r"<[^>]+>")
32
+
33
+
34
+ class EmailExtractor(Extractor):
35
+ SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {EML_MIME_TYPE}
36
+
37
+ async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
38
+ return await run_sync(self.extract_bytes_sync, content)
39
+
40
+ async def extract_path_async(self, path: Path) -> ExtractionResult:
41
+ content = await AsyncPath(path).read_bytes()
42
+ return await self.extract_bytes_async(content)
43
+
44
+ def _extract_email_headers(
45
+ self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
46
+ ) -> None:
47
+ """Extract and process email headers."""
48
+ # Use single dict access where possible to avoid repeated lookups
49
+ subject = parsed_email.get("subject")
50
+ if subject:
51
+ metadata["subject"] = subject
52
+ text_parts.append(f"Subject: {subject}")
53
+
54
+ from_info = parsed_email.get("from")
55
+ if from_info:
56
+ from_email = from_info.get("email", "") if isinstance(from_info, dict) else str(from_info)
57
+ metadata["email_from"] = from_email
58
+ text_parts.append(f"From: {from_email}")
59
+
60
+ to_info = parsed_email.get("to")
61
+ if to_info:
62
+ if isinstance(to_info, list) and to_info:
63
+ to_email = to_info[0].get("email", "") if isinstance(to_info[0], dict) else str(to_info[0])
64
+ elif isinstance(to_info, dict):
65
+ to_email = to_info.get("email", "")
66
+ else:
67
+ to_email = str(to_info)
68
+ metadata["email_to"] = to_email
69
+ text_parts.append(f"To: {to_email}")
70
+
71
+ date = parsed_email.get("date")
72
+ if date:
73
+ metadata["date"] = date
74
+ text_parts.append(f"Date: {date}")
75
+
76
+ cc = parsed_email.get("cc")
77
+ if cc:
78
+ metadata["email_cc"] = cc
79
+ text_parts.append(f"CC: {cc}")
80
+
81
+ bcc = parsed_email.get("bcc")
82
+ if bcc:
83
+ metadata["email_bcc"] = bcc
84
+ text_parts.append(f"BCC: {bcc}")
85
+
86
+ def _extract_email_body(self, parsed_email: dict[str, Any], text_parts: list[str]) -> None:
87
+ """Extract and process email body content."""
88
+ text_content = parsed_email.get("text")
89
+ if text_content:
90
+ text_parts.append(f"\n{text_content}")
91
+ return # If we have text, prefer it over HTML
92
+
93
+ html_content = parsed_email.get("html")
94
+ if html_content:
95
+ if html2text is not None:
96
+ # Use html2text if available (faster path)
97
+ h = html2text.HTML2Text()
98
+ h.ignore_links = True
99
+ h.ignore_images = True
100
+ converted_text = h.handle(html_content)
101
+ text_parts.append(f"\n{converted_text}")
102
+ else:
103
+ # Fallback: strip HTML tags and unescape entities
104
+ clean_html = _HTML_TAG_PATTERN.sub("", html_content)
105
+ clean_html = unescape(clean_html)
106
+ text_parts.append(f"\n{clean_html}")
107
+
108
+ def _extract_email_attachments(
109
+ self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
110
+ ) -> None:
111
+ """Extract and process email attachments info."""
112
+ if parsed_email.get("attachments"):
113
+ attachment_names = [att.get("name", "unknown") for att in parsed_email["attachments"]]
114
+ metadata["attachments"] = attachment_names
115
+ if attachment_names:
116
+ text_parts.append(f"\nAttachments: {', '.join(attachment_names)}")
117
+
118
+ def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
119
+ if mailparse is None:
120
+ msg = "mailparse is required for email extraction. Install with: pip install 'kreuzberg[additional-extensions]'"
121
+ raise MissingDependencyError(msg)
122
+
123
+ try:
124
+ parsed_email = mailparse.EmailDecode.load(content)
125
+ text_parts: list[str] = []
126
+ metadata: dict[str, Any] = {}
127
+
128
+ # Extract headers, body, and attachments
129
+ self._extract_email_headers(parsed_email, text_parts, metadata)
130
+ self._extract_email_body(parsed_email, text_parts)
131
+ self._extract_email_attachments(parsed_email, text_parts, metadata)
132
+
133
+ # Join efficiently
134
+ combined_text = "\n".join(text_parts)
135
+
136
+ return ExtractionResult(
137
+ content=normalize_spaces(combined_text),
138
+ mime_type=PLAIN_TEXT_MIME_TYPE,
139
+ metadata=normalize_metadata(metadata),
140
+ chunks=[],
141
+ )
142
+
143
+ except Exception as e:
144
+ msg = f"Failed to parse email content: {e}"
145
+ raise RuntimeError(msg) from e
146
+
147
+ def extract_path_sync(self, path: Path) -> ExtractionResult:
148
+ content = path.read_bytes()
149
+ return self.extract_bytes_sync(content)
@@ -8,7 +8,7 @@ from anyio import Path as AsyncPath
8
8
  from kreuzberg._extractors._base import Extractor
9
9
  from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE
10
10
  from kreuzberg._types import ExtractionResult
11
- from kreuzberg._utils._string import normalize_spaces, safe_decode
11
+ from kreuzberg._utils._string import safe_decode
12
12
  from kreuzberg._utils._sync import run_sync
13
13
 
14
14
  if TYPE_CHECKING:
@@ -26,8 +26,20 @@ class HTMLExtractor(Extractor):
26
26
  return await run_sync(self.extract_bytes_sync, content)
27
27
 
28
28
  def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
29
- result = html_to_markdown.convert_to_markdown(safe_decode(content))
30
- return ExtractionResult(content=normalize_spaces(result), mime_type=MARKDOWN_MIME_TYPE, metadata={}, chunks=[])
29
+ # Use html-to-markdown with script/nav removal for better quality
30
+ result = html_to_markdown.convert_to_markdown(
31
+ safe_decode(content),
32
+ preprocess_html=True,
33
+ preprocessing_preset="aggressive",
34
+ remove_navigation=True,
35
+ remove_forms=True,
36
+ )
37
+
38
+ # Skip normalize_spaces since quality processing will handle whitespace
39
+ extraction_result = ExtractionResult(content=result, mime_type=MARKDOWN_MIME_TYPE, metadata={}, chunks=[])
40
+
41
+ # Apply quality processing which includes normalization
42
+ return self._apply_quality_processing(extraction_result)
31
43
 
32
44
  def extract_path_sync(self, path: Path) -> ExtractionResult:
33
45
  content = path.read_bytes()
@@ -1,5 +1,9 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import contextlib
4
+ import os
5
+ import tempfile
6
+ from pathlib import Path
3
7
  from typing import TYPE_CHECKING, ClassVar
4
8
 
5
9
  from anyio import Path as AsyncPath
@@ -7,6 +11,9 @@ from anyio import Path as AsyncPath
7
11
  from kreuzberg._extractors._base import Extractor
8
12
  from kreuzberg._mime_types import IMAGE_MIME_TYPES
9
13
  from kreuzberg._ocr import get_ocr_backend
14
+ from kreuzberg._ocr._easyocr import EasyOCRConfig
15
+ from kreuzberg._ocr._paddleocr import PaddleOCRConfig
16
+ from kreuzberg._ocr._tesseract import TesseractConfig
10
17
  from kreuzberg._utils._tmp import create_temp_file
11
18
  from kreuzberg.exceptions import ValidationError
12
19
 
@@ -15,9 +22,6 @@ if TYPE_CHECKING: # pragma: no cover
15
22
 
16
23
  from kreuzberg._types import ExtractionResult
17
24
 
18
- import contextlib
19
- from pathlib import Path
20
-
21
25
 
22
26
  class ImageExtractor(Extractor):
23
27
  SUPPORTED_MIME_TYPES: ClassVar[set[str]] = IMAGE_MIME_TYPES
@@ -56,13 +60,11 @@ class ImageExtractor(Extractor):
56
60
  if self.config.ocr_backend is None:
57
61
  raise ValidationError("ocr_backend is None, cannot perform OCR")
58
62
 
59
- return await get_ocr_backend(self.config.ocr_backend).process_file(path, **self.config.get_config_dict())
63
+ result = await get_ocr_backend(self.config.ocr_backend).process_file(path, **self.config.get_config_dict())
64
+ return self._apply_quality_processing(result)
60
65
 
61
66
  def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
62
67
  """Pure sync implementation of extract_bytes."""
63
- import os
64
- import tempfile
65
-
66
68
  extension = self._get_extension_from_mime_type(self.mime_type)
67
69
  fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")
68
70
 
@@ -80,43 +82,26 @@ class ImageExtractor(Extractor):
80
82
  if self.config.ocr_backend is None:
81
83
  raise ValidationError("ocr_backend is None, cannot perform OCR")
82
84
 
83
- from kreuzberg._types import ExtractionResult
85
+ backend = get_ocr_backend(self.config.ocr_backend)
84
86
 
85
87
  if self.config.ocr_backend == "tesseract":
86
- from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
87
- from kreuzberg._ocr._tesseract import TesseractConfig
88
-
89
- if isinstance(self.config.ocr_config, TesseractConfig):
90
- config = self.config.ocr_config
91
- else:
92
- config = TesseractConfig()
93
-
94
- results = process_batch_images_sync_pure([str(path)], config)
95
- if results:
96
- return results[0]
97
- return ExtractionResult(content="", mime_type="text/plain", metadata={}, chunks=[])
98
-
99
- if self.config.ocr_backend == "paddleocr":
100
- from kreuzberg._multiprocessing.sync_paddleocr import process_image_sync_pure as paddle_process
101
- from kreuzberg._ocr._paddleocr import PaddleOCRConfig
102
-
88
+ config = (
89
+ self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
90
+ )
91
+ result = backend.process_file_sync(path, **config.__dict__)
92
+ elif self.config.ocr_backend == "paddleocr":
103
93
  paddle_config = (
104
94
  self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
105
95
  )
106
-
107
- return paddle_process(path, paddle_config)
108
-
109
- if self.config.ocr_backend == "easyocr":
110
- from kreuzberg._multiprocessing.sync_easyocr import process_image_sync_pure as easy_process
111
- from kreuzberg._ocr._easyocr import EasyOCRConfig
112
-
96
+ result = backend.process_file_sync(path, **paddle_config.__dict__)
97
+ elif self.config.ocr_backend == "easyocr":
113
98
  easy_config = (
114
99
  self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
115
100
  )
116
-
117
- return easy_process(path, easy_config)
118
-
119
- raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
101
+ result = backend.process_file_sync(path, **easy_config.__dict__)
102
+ else:
103
+ raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
104
+ return self._apply_quality_processing(result)
120
105
 
121
106
  def _get_extension_from_mime_type(self, mime_type: str) -> str:
122
107
  if mime_type in self.IMAGE_MIME_TYPE_EXT_MAP:
@@ -1,8 +1,11 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import contextlib
4
+ import os
4
5
  import re
6
+ import subprocess
5
7
  import sys
8
+ import tempfile
6
9
  from json import JSONDecodeError, loads
7
10
  from pathlib import Path
8
11
  from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal, cast
@@ -203,10 +206,6 @@ class PandocExtractor(Extractor):
203
206
  Returns:
204
207
  ExtractionResult with the extracted text and metadata.
205
208
  """
206
- import os
207
- import tempfile
208
- from pathlib import Path
209
-
210
209
  extension = self._get_pandoc_type_from_mime_type(self.mime_type)
211
210
  fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")
212
211
 
@@ -579,8 +578,6 @@ class PandocExtractor(Extractor):
579
578
 
580
579
  def _validate_pandoc_version_sync(self) -> None:
581
580
  """Synchronous version of _validate_pandoc_version."""
582
- import subprocess
583
-
584
581
  try:
585
582
  if self._checked_version:
586
583
  return
@@ -625,10 +622,6 @@ class PandocExtractor(Extractor):
625
622
 
626
623
  def _extract_metadata_sync(self, path: Path) -> Metadata:
627
624
  """Synchronous version of _handle_extract_metadata."""
628
- import os
629
- import subprocess
630
- import tempfile
631
-
632
625
  pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
633
626
  fd, metadata_file = tempfile.mkstemp(suffix=".json")
634
627
  os.close(fd)
@@ -663,10 +656,6 @@ class PandocExtractor(Extractor):
663
656
 
664
657
  def _extract_file_sync(self, path: Path) -> str:
665
658
  """Synchronous version of _handle_extract_file."""
666
- import os
667
- import subprocess
668
- import tempfile
669
-
670
659
  pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
671
660
  fd, output_path = tempfile.mkstemp(suffix=".md")
672
661
  os.close(fd)
@@ -1,6 +1,8 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import contextlib
4
+ import os
5
+ import tempfile
4
6
  from multiprocessing import cpu_count
5
7
  from pathlib import Path
6
8
  from re import Pattern
@@ -10,15 +12,21 @@ from typing import TYPE_CHECKING, ClassVar, cast
10
12
  import anyio
11
13
  import pypdfium2
12
14
  from anyio import Path as AsyncPath
15
+ from playa import parse
13
16
 
14
17
  from kreuzberg._extractors._base import Extractor
15
18
  from kreuzberg._mime_types import PDF_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
16
19
  from kreuzberg._ocr import get_ocr_backend
17
- from kreuzberg._playa import extract_pdf_metadata
20
+ from kreuzberg._ocr._easyocr import EasyOCRConfig
21
+ from kreuzberg._ocr._paddleocr import PaddleOCRConfig
22
+ from kreuzberg._ocr._tesseract import TesseractConfig
23
+ from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
18
24
  from kreuzberg._types import ExtractionResult, OcrBackendType
25
+ from kreuzberg._utils._errors import create_error_context, should_retry
19
26
  from kreuzberg._utils._pdf_lock import pypdfium_file_lock
20
27
  from kreuzberg._utils._string import normalize_spaces
21
28
  from kreuzberg._utils._sync import run_sync, run_taskgroup_batched
29
+ from kreuzberg._utils._table import generate_table_summary
22
30
  from kreuzberg._utils._tmp import create_temp_file
23
31
  from kreuzberg.exceptions import ParsingError
24
32
 
@@ -63,17 +71,30 @@ class PDFExtractor(Extractor):
63
71
  result.metadata = await extract_pdf_metadata(content_bytes)
64
72
 
65
73
  if self.config.extract_tables:
66
- from kreuzberg._gmft import extract_tables
67
-
68
- result.tables = await extract_tables(path, self.config.gmft_config)
74
+ # GMFT is optional dependency
75
+ try:
76
+ from kreuzberg._gmft import extract_tables
69
77
 
70
- return result
78
+ result.tables = await extract_tables(path, self.config.gmft_config)
79
+ except ImportError:
80
+ result.tables = []
81
+
82
+ # Enhance metadata with table information
83
+ if result.tables:
84
+ table_summary = generate_table_summary(result.tables)
85
+ result.metadata.update(
86
+ {
87
+ "table_count": table_summary["table_count"],
88
+ "tables_summary": f"Document contains {table_summary['table_count']} tables "
89
+ f"across {table_summary['pages_with_tables']} pages with "
90
+ f"{table_summary['total_rows']} total rows",
91
+ }
92
+ )
93
+
94
+ return self._apply_quality_processing(result)
71
95
 
72
96
  def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
73
97
  """Pure sync implementation of PDF extraction from bytes."""
74
- import os
75
- import tempfile
76
-
77
98
  fd, temp_path = tempfile.mkstemp(suffix=".pdf")
78
99
  try:
79
100
  with os.fdopen(fd, "wb") as f:
@@ -81,8 +102,6 @@ class PDFExtractor(Extractor):
81
102
 
82
103
  result = self.extract_path_sync(Path(temp_path))
83
104
 
84
- from kreuzberg._playa import extract_pdf_metadata_sync
85
-
86
105
  metadata = extract_pdf_metadata_sync(content)
87
106
  result.metadata = metadata
88
107
 
@@ -100,16 +119,21 @@ class PDFExtractor(Extractor):
100
119
 
101
120
  tables = []
102
121
  if self.config.extract_tables:
122
+ # GMFT is optional dependency
103
123
  try:
104
124
  from kreuzberg._gmft import extract_tables_sync
105
125
 
106
126
  tables = extract_tables_sync(path)
107
127
  except ImportError:
108
- pass
128
+ tables = []
129
+
130
+ # Use playa for better text structure preservation when not using OCR
131
+ if not self.config.force_ocr and self._validate_extracted_text(text):
132
+ text = self._extract_with_playa_sync(path, fallback_text=text)
109
133
 
110
134
  text = normalize_spaces(text)
111
135
 
112
- return ExtractionResult(
136
+ result = ExtractionResult(
113
137
  content=text,
114
138
  mime_type=PLAIN_TEXT_MIME_TYPE,
115
139
  metadata={},
@@ -117,6 +141,21 @@ class PDFExtractor(Extractor):
117
141
  chunks=[],
118
142
  )
119
143
 
144
+ # Enhance metadata with table information
145
+ if tables:
146
+ table_summary = generate_table_summary(tables)
147
+ result.metadata.update(
148
+ {
149
+ "table_count": table_summary["table_count"],
150
+ "tables_summary": f"Document contains {table_summary['table_count']} tables "
151
+ f"across {table_summary['pages_with_tables']} pages with "
152
+ f"{table_summary['total_rows']} total rows",
153
+ }
154
+ )
155
+
156
+ # Apply quality processing
157
+ return self._apply_quality_processing(result)
158
+
120
159
  def _validate_extracted_text(self, text: str, corruption_threshold: float = 0.05) -> bool:
121
160
  """Check if text extracted from PDF is valid or corrupted.
122
161
 
@@ -155,8 +194,6 @@ class PDFExtractor(Extractor):
155
194
  Returns:
156
195
  A list of Pillow Images.
157
196
  """
158
- from kreuzberg._utils._errors import create_error_context, should_retry
159
-
160
197
  document: pypdfium2.PdfDocument | None = None
161
198
  last_error = None
162
199
 
@@ -228,8 +265,6 @@ class PDFExtractor(Extractor):
228
265
  Returns:
229
266
  The extracted text.
230
267
  """
231
- from kreuzberg._utils._errors import create_error_context
232
-
233
268
  document: pypdfium2.PdfDocument | None = None
234
269
  try:
235
270
  with pypdfium_file_lock(input_file):
@@ -283,7 +318,7 @@ class PDFExtractor(Extractor):
283
318
  text_parts = []
284
319
  for page in pdf:
285
320
  text_page = page.get_textpage()
286
- text = text_page.get_text_range()
321
+ text = text_page.get_text_bounded()
287
322
  text_parts.append(text)
288
323
  text_page.close()
289
324
  page.close()
@@ -309,9 +344,6 @@ class PDFExtractor(Extractor):
309
344
  bitmap.close()
310
345
  page.close()
311
346
 
312
- import os
313
- import tempfile
314
-
315
347
  image_paths = []
316
348
  temp_files = []
317
349
 
@@ -339,43 +371,44 @@ class PDFExtractor(Extractor):
339
371
 
340
372
  def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
341
373
  """Process PDF images with the configured OCR backend."""
342
- if self.config.ocr_backend == "tesseract":
343
- from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
344
- from kreuzberg._ocr._tesseract import TesseractConfig
374
+ backend = get_ocr_backend(self.config.ocr_backend)
375
+ paths = [Path(p) for p in image_paths]
345
376
 
346
- tesseract_config = (
377
+ if self.config.ocr_backend == "tesseract":
378
+ config = (
347
379
  self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
348
380
  )
349
- results = process_batch_images_sync_pure([str(p) for p in image_paths], tesseract_config)
350
- text_parts = [r.content for r in results]
351
- return "\n\n".join(text_parts)
352
-
353
- if self.config.ocr_backend == "paddleocr":
354
- from kreuzberg._multiprocessing.sync_paddleocr import process_image_sync_pure as paddle_process
355
- from kreuzberg._ocr._paddleocr import PaddleOCRConfig
356
-
381
+ results = backend.process_batch_sync(paths, **config.__dict__)
382
+ elif self.config.ocr_backend == "paddleocr":
357
383
  paddle_config = (
358
384
  self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
359
385
  )
360
-
361
- text_parts = []
362
- for image_path in image_paths:
363
- result = paddle_process(Path(image_path), paddle_config)
364
- text_parts.append(result.content)
365
- return "\n\n".join(text_parts)
366
-
367
- if self.config.ocr_backend == "easyocr":
368
- from kreuzberg._multiprocessing.sync_easyocr import process_image_sync_pure as easy_process
369
- from kreuzberg._ocr._easyocr import EasyOCRConfig
370
-
386
+ results = backend.process_batch_sync(paths, **paddle_config.__dict__)
387
+ elif self.config.ocr_backend == "easyocr":
371
388
  easy_config = (
372
389
  self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
373
390
  )
391
+ results = backend.process_batch_sync(paths, **easy_config.__dict__)
392
+ else:
393
+ raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
394
+
395
+ text_parts = [r.content for r in results]
396
+ return "\n\n".join(text_parts)
397
+
398
+ def _extract_with_playa_sync(self, path: Path, fallback_text: str) -> str:
399
+ """Extract text using playa for better structure preservation."""
400
+ with contextlib.suppress(Exception):
401
+ content = path.read_bytes()
402
+ document = parse(content, max_workers=1)
374
403
 
375
404
  text_parts = []
376
- for image_path in image_paths:
377
- result = easy_process(Path(image_path), easy_config)
378
- text_parts.append(result.content)
379
- return "\n\n".join(text_parts)
405
+ for page in document.pages:
406
+ # Extract text while preserving structure
407
+ page_text = page.extract_text()
408
+ if page_text and page_text.strip():
409
+ text_parts.append(page_text)
410
+
411
+ if text_parts:
412
+ return "\n\n".join(text_parts)
380
413
 
381
- raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
414
+ return fallback_text